{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Otto商品分类——数据探索&特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Otto数据集是著名电商Otto提供的一个多类商品分类问题，类别数=9. 每个样本有93维数值型特征（整数，可能表示某种事件发生的次数，已经进行过脱敏处理）。 \n",
    "竞赛官网：https://www.kaggle.com/c/otto-group-product-classification-challenge/data\n",
    "\n",
    "\n",
    "第一名：https://www.kaggle.com/c/otto-group-product-classification-challenge/discussion/14335\n",
    "\n",
    "第二名：http://blog.kaggle.com/2015/06/09/otto-product-classification-winners-interview-2nd-place-alexander-guschin/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 首先 import 必要的模块\n",
    "import pandas as pd \n",
    "import numpy as np\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \\\n",
       "0   1       1       0       0       0       0       0       0       0       0   \n",
       "1   2       0       0       0       0       0       0       0       1       0   \n",
       "2   3       0       0       0       0       0       0       0       1       0   \n",
       "3   4       1       0       0       1       6       1       5       0       0   \n",
       "4   5       0       0       0       0       0       0       0       0       0   \n",
       "\n",
       "   ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \\\n",
       "0  ...        1        0        0        0        0        0        0   \n",
       "1  ...        0        0        0        0        0        0        0   \n",
       "2  ...        0        0        0        0        0        0        0   \n",
       "3  ...        0        1        2        0        0        0        0   \n",
       "4  ...        1        0        0        0        0        1        0   \n",
       "\n",
       "   feat_92  feat_93   target  \n",
       "0        0        0  Class_1  \n",
       "1        0        0  Class_1  \n",
       "2        0        0  Class_1  \n",
       "3        0        0  Class_1  \n",
       "4        0        0  Class_1  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取数据\n",
    "# path to where the data lies\n",
    "dpath = './data/'\n",
    "train = pd.read_csv(dpath +\"Otto_train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 61878 entries, 0 to 61877\n",
      "Data columns (total 95 columns):\n",
      "id         61878 non-null int64\n",
      "feat_1     61878 non-null int64\n",
      "feat_2     61878 non-null int64\n",
      "feat_3     61878 non-null int64\n",
      "feat_4     61878 non-null int64\n",
      "feat_5     61878 non-null int64\n",
      "feat_6     61878 non-null int64\n",
      "feat_7     61878 non-null int64\n",
      "feat_8     61878 non-null int64\n",
      "feat_9     61878 non-null int64\n",
      "feat_10    61878 non-null int64\n",
      "feat_11    61878 non-null int64\n",
      "feat_12    61878 non-null int64\n",
      "feat_13    61878 non-null int64\n",
      "feat_14    61878 non-null int64\n",
      "feat_15    61878 non-null int64\n",
      "feat_16    61878 non-null int64\n",
      "feat_17    61878 non-null int64\n",
      "feat_18    61878 non-null int64\n",
      "feat_19    61878 non-null int64\n",
      "feat_20    61878 non-null int64\n",
      "feat_21    61878 non-null int64\n",
      "feat_22    61878 non-null int64\n",
      "feat_23    61878 non-null int64\n",
      "feat_24    61878 non-null int64\n",
      "feat_25    61878 non-null int64\n",
      "feat_26    61878 non-null int64\n",
      "feat_27    61878 non-null int64\n",
      "feat_28    61878 non-null int64\n",
      "feat_29    61878 non-null int64\n",
      "feat_30    61878 non-null int64\n",
      "feat_31    61878 non-null int64\n",
      "feat_32    61878 non-null int64\n",
      "feat_33    61878 non-null int64\n",
      "feat_34    61878 non-null int64\n",
      "feat_35    61878 non-null int64\n",
      "feat_36    61878 non-null int64\n",
      "feat_37    61878 non-null int64\n",
      "feat_38    61878 non-null int64\n",
      "feat_39    61878 non-null int64\n",
      "feat_40    61878 non-null int64\n",
      "feat_41    61878 non-null int64\n",
      "feat_42    61878 non-null int64\n",
      "feat_43    61878 non-null int64\n",
      "feat_44    61878 non-null int64\n",
      "feat_45    61878 non-null int64\n",
      "feat_46    61878 non-null int64\n",
      "feat_47    61878 non-null int64\n",
      "feat_48    61878 non-null int64\n",
      "feat_49    61878 non-null int64\n",
      "feat_50    61878 non-null int64\n",
      "feat_51    61878 non-null int64\n",
      "feat_52    61878 non-null int64\n",
      "feat_53    61878 non-null int64\n",
      "feat_54    61878 non-null int64\n",
      "feat_55    61878 non-null int64\n",
      "feat_56    61878 non-null int64\n",
      "feat_57    61878 non-null int64\n",
      "feat_58    61878 non-null int64\n",
      "feat_59    61878 non-null int64\n",
      "feat_60    61878 non-null int64\n",
      "feat_61    61878 non-null int64\n",
      "feat_62    61878 non-null int64\n",
      "feat_63    61878 non-null int64\n",
      "feat_64    61878 non-null int64\n",
      "feat_65    61878 non-null int64\n",
      "feat_66    61878 non-null int64\n",
      "feat_67    61878 non-null int64\n",
      "feat_68    61878 non-null int64\n",
      "feat_69    61878 non-null int64\n",
      "feat_70    61878 non-null int64\n",
      "feat_71    61878 non-null int64\n",
      "feat_72    61878 non-null int64\n",
      "feat_73    61878 non-null int64\n",
      "feat_74    61878 non-null int64\n",
      "feat_75    61878 non-null int64\n",
      "feat_76    61878 non-null int64\n",
      "feat_77    61878 non-null int64\n",
      "feat_78    61878 non-null int64\n",
      "feat_79    61878 non-null int64\n",
      "feat_80    61878 non-null int64\n",
      "feat_81    61878 non-null int64\n",
      "feat_82    61878 non-null int64\n",
      "feat_83    61878 non-null int64\n",
      "feat_84    61878 non-null int64\n",
      "feat_85    61878 non-null int64\n",
      "feat_86    61878 non-null int64\n",
      "feat_87    61878 non-null int64\n",
      "feat_88    61878 non-null int64\n",
      "feat_89    61878 non-null int64\n",
      "feat_90    61878 non-null int64\n",
      "feat_91    61878 non-null int64\n",
      "feat_92    61878 non-null int64\n",
      "feat_93    61878 non-null int64\n",
      "target     61878 non-null object\n",
      "dtypes: int64(94), object(1)\n",
      "memory usage: 44.8+ MB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "93个整数型匿名特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_84</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.00000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61878.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>30939.500000</td>\n",
       "      <td>0.38668</td>\n",
       "      <td>0.263066</td>\n",
       "      <td>0.901467</td>\n",
       "      <td>0.779081</td>\n",
       "      <td>0.071043</td>\n",
       "      <td>0.025696</td>\n",
       "      <td>0.193704</td>\n",
       "      <td>0.662433</td>\n",
       "      <td>1.011296</td>\n",
       "      <td>...</td>\n",
       "      <td>0.070752</td>\n",
       "      <td>0.532306</td>\n",
       "      <td>1.128576</td>\n",
       "      <td>0.393549</td>\n",
       "      <td>0.874915</td>\n",
       "      <td>0.457772</td>\n",
       "      <td>0.812421</td>\n",
       "      <td>0.264941</td>\n",
       "      <td>0.380119</td>\n",
       "      <td>0.126135</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>17862.784315</td>\n",
       "      <td>1.52533</td>\n",
       "      <td>1.252073</td>\n",
       "      <td>2.934818</td>\n",
       "      <td>2.788005</td>\n",
       "      <td>0.438902</td>\n",
       "      <td>0.215333</td>\n",
       "      <td>1.030102</td>\n",
       "      <td>2.255770</td>\n",
       "      <td>3.474822</td>\n",
       "      <td>...</td>\n",
       "      <td>1.151460</td>\n",
       "      <td>1.900438</td>\n",
       "      <td>2.681554</td>\n",
       "      <td>1.575455</td>\n",
       "      <td>2.115466</td>\n",
       "      <td>1.527385</td>\n",
       "      <td>4.597804</td>\n",
       "      <td>2.045646</td>\n",
       "      <td>0.982385</td>\n",
       "      <td>1.201720</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>15470.250000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>30939.500000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>46408.750000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>61878.000000</td>\n",
       "      <td>61.00000</td>\n",
       "      <td>51.000000</td>\n",
       "      <td>64.000000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>19.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>76.000000</td>\n",
       "      <td>43.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>76.000000</td>\n",
       "      <td>55.000000</td>\n",
       "      <td>65.000000</td>\n",
       "      <td>67.000000</td>\n",
       "      <td>30.000000</td>\n",
       "      <td>61.000000</td>\n",
       "      <td>130.000000</td>\n",
       "      <td>52.000000</td>\n",
       "      <td>19.000000</td>\n",
       "      <td>87.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 94 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id       feat_1        feat_2        feat_3        feat_4  \\\n",
       "count  61878.000000  61878.00000  61878.000000  61878.000000  61878.000000   \n",
       "mean   30939.500000      0.38668      0.263066      0.901467      0.779081   \n",
       "std    17862.784315      1.52533      1.252073      2.934818      2.788005   \n",
       "min        1.000000      0.00000      0.000000      0.000000      0.000000   \n",
       "25%    15470.250000      0.00000      0.000000      0.000000      0.000000   \n",
       "50%    30939.500000      0.00000      0.000000      0.000000      0.000000   \n",
       "75%    46408.750000      0.00000      0.000000      0.000000      0.000000   \n",
       "max    61878.000000     61.00000     51.000000     64.000000     70.000000   \n",
       "\n",
       "             feat_5        feat_6        feat_7        feat_8        feat_9  \\\n",
       "count  61878.000000  61878.000000  61878.000000  61878.000000  61878.000000   \n",
       "mean       0.071043      0.025696      0.193704      0.662433      1.011296   \n",
       "std        0.438902      0.215333      1.030102      2.255770      3.474822   \n",
       "min        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "25%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "50%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "75%        0.000000      0.000000      0.000000      1.000000      0.000000   \n",
       "max       19.000000     10.000000     38.000000     76.000000     43.000000   \n",
       "\n",
       "       ...       feat_84       feat_85       feat_86       feat_87  \\\n",
       "count  ...  61878.000000  61878.000000  61878.000000  61878.000000   \n",
       "mean   ...      0.070752      0.532306      1.128576      0.393549   \n",
       "std    ...      1.151460      1.900438      2.681554      1.575455   \n",
       "min    ...      0.000000      0.000000      0.000000      0.000000   \n",
       "25%    ...      0.000000      0.000000      0.000000      0.000000   \n",
       "50%    ...      0.000000      0.000000      0.000000      0.000000   \n",
       "75%    ...      0.000000      0.000000      1.000000      0.000000   \n",
       "max    ...     76.000000     55.000000     65.000000     67.000000   \n",
       "\n",
       "            feat_88       feat_89       feat_90       feat_91       feat_92  \\\n",
       "count  61878.000000  61878.000000  61878.000000  61878.000000  61878.000000   \n",
       "mean       0.874915      0.457772      0.812421      0.264941      0.380119   \n",
       "std        2.115466      1.527385      4.597804      2.045646      0.982385   \n",
       "min        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "25%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "50%        0.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "75%        1.000000      0.000000      0.000000      0.000000      0.000000   \n",
       "max       30.000000     61.000000    130.000000     52.000000     19.000000   \n",
       "\n",
       "            feat_93  \n",
       "count  61878.000000  \n",
       "mean       0.126135  \n",
       "std        1.201720  \n",
       "min        0.000000  \n",
       "25%        0.000000  \n",
       "50%        0.000000  \n",
       "75%        0.000000  \n",
       "max       87.000000  \n",
       "\n",
       "[8 rows x 94 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 各属性的统计特性\n",
    "train.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 标签的分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZEAAAEHCAYAAABvHnsJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAfFklEQVR4nO3de7xWZZ338c9XEA+VgYFGHNrYbHtFZmU7NX3MUynYBNZoj04FL/WJp0bNejqg2agjUdrJyUobGkmsRiTLxLKQTLQpUcADCOqwQ9MtJBioVKMO+nv+WNeOxfbem8Xa99H9fb9e92uv9VvXuq/fvcX9u691uhQRmJmZlbFToxMwM7PW5SJiZmaluYiYmVlpLiJmZlaai4iZmZU2uNEJ1Nvw4cOjra2t0WmYmbWUZcuWPRERI3rGB1wRaWtrY+nSpY1Ow8yspUj6Q6W4D2eZmVlpLiJmZlaai4iZmZXmImJmZqW5iJiZWWkuImZmVpqLiJmZlVazIiJptqT1ku7rET9T0oOSVkr6ci5+jqTOtO3YXHxCinVKOjsXHyfpDkmrJV0jaUitPouZmVVWy5HIlcCEfEDSkcBkYP+IeCPw1RQfD5wEvDHtc5mkQZIGAd8GJgLjgZNTW4CLgUsioh3YBJxWw89iZmYV1OyO9Yi4TVJbj/DHgIsi4tnUZn2KTwbmpvhDkjqBA9O2zohYAyBpLjBZ0v3AUcA/pjZzgAuAy2vzaerrkQvfVPc+x563ou59mlnrq/c5kX2Bw9JhqFslvT3FRwGP5tp1pVhv8VcBT0bElh7xiiRNk7RU0tINGzZU6aOYmVm9i8hgYBhwMPAZYJ4kAarQNkrEK4qIWRHREREdI0a86PlhZmZWUr0fwNgF/CSyid3vlPQCMDzFx+TajQbWpuVK8SeAoZIGp9FIvr2ZmdVJvUciPyU7l4GkfYEhZAVhPnCSpF0kjQPagTuBJUB7uhJrCNnJ9/mpCN0CnJDedypwfV0/iZmZ1W4kIulq4AhguKQu4HxgNjA7Xfb7HDA1FYSVkuYBq4AtwOkR8Xx6nzOABcAgYHZErExdTAfmSvoCcDdwRa0+i5mZVVbLq7NO7mXTh3ppPxOYWSF+I3Bjhfgatl7BZWZmDeA71s3MrDQXETMzK81FxMzMSnMRMTOz0lxEzMysNBcRMzMrzUXEzMxKcxExM7PSXETMzKw0FxEzMyvNRcTMzEpzETEzs9JcRMzMrDQXETMzK81FxMzMSnMRMTOz0lxEzMystJoVEUmzJa1PU+H23PZpSSFpeFqXpEsldUpaLumAXNupklan19Rc/G2SVqR9LpWkWn0WMzOrrGbT4wJXAt8CrsoHJY0B3g08kgtPBNrT6yDgcuAgSXuSzc3eAQSwTNL8iNiU2kwDFpNNnzsB+EUNP49ZS5r5oRMa0u+5P7i2If1afdVsJBIRtwEbK2y6BPgsWVHoNhm4KjKLgaGSRgLHAgsjYmMqHAuBCWnbHhFxe0QEWaE6vlafxczMKqvrORFJk4DHIuLeHptGAY/m1rtSrK94V4V4b/1Ok7RU0tINGzb04xOYmVle3YqIpN2Bc4HzKm2uEIsS8YoiYlZEdEREx4gRI4qka2ZmBdRzJPI6YBxwr6SHgdHAXZJeTTaSGJNrOxpYu5346ApxMzOro7oVkYhYERF7RURbRLSRFYIDIuKPwHxgSrpK62DgqYhYBywAjpE0TNIw4BhgQdq2WdLB6aqsKcD19fosZmaWqeUlvlcDtwOvl9Ql6bQ+mt8IrAE6ge8C/wQQERuBGcCS9LowxQA+Bvx72uf3+MosM7O6q9klvhFx8na2t+WWAzi9l3azgdkV4kuB/fqXpZmZ9YfvWDczs9JcRMzMrDQXETMzK81FxMzMSnMRMTOz0lxEzMysNBcRMzMrzUXEzMxK224RkfQySTul5X0lTZK0c+1TMzOzZldkJHIbsKukUcDN
wClkE06ZmdkAV6SIKCL+Crwf+GZEvA8YX9u0zMysFRQqIpLeAXwQ+HmK1XJaXTMzaxFFisgngHOA6yJipaR9gFtqm5aZmbWC7Y4oIuJW4FZJL0vra4CP1zoxMzNrfkWuznqHpFXA/Wn9zZIuq3lmZmbW9IoczvpX4FjgTwARcS/wzlomZWZmraHQzYYR8WiP0PM1yMXMzFpMkSLyqKRDgJA0RNKnSYe2+iJptqT1ku7Lxb4i6QFJyyVdJ2lobts5kjolPSjp2Fx8Qop1Sjo7Fx8n6Q5JqyVdI2lI4U9tZmZVUaSIfJRs6tpRQBfwFnqZyraHK4EJPWILgf0iYn/gv8iu+kLSeOAk4I1pn8skDZI0CPg2MJHs3pSTU1uAi4FLIqId2AT0NYe7mZnVwHaLSEQ8EREfjIi9I2KviPhQRPypwH63ARt7xG6KiC1pdTEwOi1PBuZGxLMR8RDQCRyYXp0RsSYingPmApMlCTgKuDbtPwc4fruf1szMqqrI1Vlzehx2GiZpdhX6PhX4RVoeBeTPu3SlWG/xVwFP5gpSd7wiSdMkLZW0dMOGDVVI3czMoNjhrP0j4snulYjYBLy1P51KOhfYAvywO1ShWZSIVxQRsyKiIyI6RowYsaPpmplZL4o8vmQnScNS8UDSngX3q0jSVODvgaMjovsPfxcwJtdsNLA2LVeKPwEMlTQ4jUby7c3MrE6KjES+BvxO0gxJM4DfAV8u05mkCcB0YFJ6qGO3+cBJknaRNA5oB+4ElgDt6UqsIWQn3+en4nMLcELafypwfZmczMysvCKPPblK0jLgSLLDSO+PiFXb20/S1cARwHBJXcD5ZFdj7QIszM6NszgiPpqeyTUPWEV2mOv0iHg+vc8ZwAJgEDA7IlamLqYDcyV9AbgbuKL4xzYzs2ooeljqAbLLaAcDSBobEY/0tUNEnFwh3Osf+oiYCcysEL8RuLFCfA3Z1VtmZtYg2y0iks4kG0U8TnanushOYu9f29TMzKzZFRmJnAW8vsi9IWZmNrAUeuwJ8FStEzEzs9ZTZCSyBlgk6efAs93BiPh6zbIyM7OWUKSIPJJeQ9LLzMwMKHaJ778ASHpZRPyl9imZmVmr8MyGZmZWmmc2NDOz0jyzoZmZlVbkxPo2MxsCH6fAzIZmZvbSV8uZDc3M7CWuz5FImp72wxHxwTrlY2ZmLaTPkUh6ku7kOuViZmYtpsg5kd9K+hZwDfC3+0Qi4q6aZWVmZi2hSBE5JP28MBcL4Kjqp2NmZq1ke+dEdgIuj4h5dcrHzMxayPbOibwAnFHmjSXNlrRe0n252J6SFkpanX4OS3FJulRSp6Tlkg7I7TM1tV+d5mfvjr9N0oq0z6VKUyWamVn9FLnEd6GkT0sak4rAnpL2LLDflcCEHrGzgZsjoh24Oa0DTCSbV70dmAZcDlnRIZsQ6yCyWQzP7y48qc203H49+zIzsxorck7k1PQzf29IAPv0tVNE3CaprUd4Mtm86wBzgEVkc6VPBq6KiAAWSxoqaWRquzAiNgJIWghMkLQI2CMibk/xq4DjgV8U+DxmZlYlRZ7iO66K/e0dEevS+66TtFeKjyKb/KpbV4r1Fe+qEK9I0jSyUQtjx47t50cwM7NuReZYn1IpHhFXVTGPSuczokS8ooiYBcwC6Ojo6LWdmZntmCKHs96eW94VOBq4CyhTRB6XNDKNQkYC61O8CxiTazcaWJviR/SIL0rx0RXam5lZHW33xHpEnJl7fQR4K+VnOJwPdF9hNRW4Phefkq7SOhh4Kh32WgAcI2lYOqF+DLAgbdss6eB0VdaU3HuZmVmdFBmJ9PRXsquh+iTparJRxHBJXWRXWV0EzJN0GtmUuyem5jcCxwGd6f1PAYiIjZJmAEtSuwu7T7IDHyO7Amw3shPqPqluZlZnRc6J3MDW8w07AeOB7d58GBEn97Lp6Aptg16eDBwRs4HZFeJLgf22l4eZmdVOkZHIV3PLW4A/RERXb43NzGzg
KFJEHgHWRcQzAJJ2k9QWEQ/XNDMzM2t6Re5Y/xHwQm79+RQzM7MBrkgRGRwRz3WvpOWyV2eZmdlLSJEiskHSpO4VSZOBJ2qXkpmZtYoi50Q+CvwwTUwF2Y1+Fe9iNzOzgaXIs7N+Dxws6eWAImJz7dMyM7NWsN3DWZK+KGloRPw5Ijanu8e/UI/kzMysuRU5JzIxIp7sXomITWR3l5uZ2QBXpIgMkrRL94qk3YBd+mhvZmYDRJET6z8Abpb0PbLHn5xKNqGUmZkNcEVOrH9Z0nLgXSk0IyIW1DYtMzNrBUWf4ns3sDPZSOTu2qVjZmatpMjVWR8A7gROAD4A3CHphFonZmZmza/ISORc4O0RsR5A0gjgV8C1tUzMzKzeLrjgggHVbzUUuTprp+4Ckvyp4H5mZvYSV2Qk8ktJC4Cr0/r/JpuJ0MzMBrgic6x/Bvg3YH/gzcCsiJjen04lfVLSSkn3Sbpa0q6Sxkm6Q9JqSddIGpLa7pLWO9P2ttz7nJPiD0o6tj85mZnZjit0WCoifhIR/y8iPhkR1/WnQ0mjgI8DHRGxHzAIOAm4GLgkItqBTcBpaZfTgE0R8XfAJakdksan/d4ITAAukzSoP7mZmdmOadS5jcHAbpIGA7sD64Cj2Hqyfg5wfFqezNabG68FjpakFJ8bEc9GxENAJ3BgnfI3MzOK3ydSNRHxmKSvkk27+9/ATcAy4MmI2JKadQGj0vIo4NG07xZJTwGvSvHFubfO77MNSdOAaQBjx46t6ucZKA795qEN6fe3Z/62If2aWTG9jkQk3Zx+XlzNDiUNIxtFjANeA7wMmFihaXTv0su23uIvDkbMioiOiOgYMWLEjidtZmYV9TUSGSnpcGCSpLn0+KMdEXeV7PNdwEMRsQFA0k+AQ4Chkgan0choYG1q3wWMAbrS4a9XAhtz8W75fczMrA76KiLnAWeT/XH+eo9tQXYOo4xHyCa52p3scNbRwFLgFrK74ucCU4HrU/v5af32tP3XERGS5gP/IenrZCOadrI7683MrE56LSIRcS1wraR/jogZ1eowIu6QdC1wF7CF7Flcs4CfA3PThFd3A1ekXa4Avi+pk2wEclJ6n5WS5gGr0vucHhHPVytPMzPbviJP8Z0haRLwzhRaFBE/60+nEXE+cH6P8BoqXF0VEc8AJ/byPjOBmf3JxczMyivyAMYvAWeRfeNfBZyVYmZmNsAVucT3PcBbIuIFAElzyA43nVPLxMzMrPkVvdlwaG75lbVIxMzMWk+RkciXgLsl3UJ2me878SjEzMwodmL9akmLgLeTFZHpEfHHWidmZmbNr9BjTyJiHdn9GmZmZn/jyaXMzKw0FxEzMyutzyIiaSdJ99UrGTMzay19FpF0b8i9kvz8dDMze5EiJ9ZHAisl3Qn8pTsYEZNqlpWZmbWEIkXkX2qehZmZtaQi94ncKum1QHtE/Co9wt1zmZuZWaEHMH6EbG7zf0uhUcBPa5mUmZm1hiKX+J4OHAo8DRARq4G9apmUmZm1hiJF5NmIeK57JU1RW3EuczMzG1iKFJFbJX0O2E3Su4EfATfUNi0zM2sFRYrI2cAGYAXwf4Ebgc/3p1NJQyVdK+kBSfdLeoekPSUtlLQ6/RyW2krSpZI6JS2XdEDufaam9qslTe1PTmZmtuOKXJ31QpqI6g6yw1gPRkR/D2d9A/hlRJwgaQiwO/A54OaIuEjS2WTFazowEWhPr4OAy4GDJO1JNsVuR8prmaT5EbGpn7mZmVlBRa7Oeg/we+BS4FtAp6SJZTuUtAfZnCRXAETEcxHxJDAZmJOazQGOT8uTgasisxgYKmkkcCywMCI2psKxEJhQNi8zM9txRW42/BpwZER0Akh6HfBz4Bcl+9yH7PDY9yS9GVhGNof73umR80TEOkndV4CNAh7N7d+VYr3FX0TSNGAawNixfoKLmVm1FDknsr67gCRrgPX96HMwcABweUS8lexRKmf30V4VYtFH/MXBiFkR0RER
HSNGjNjRfM3MrBe9jkQkvT8trpR0IzCP7I/0icCSfvTZBXRFxB1p/VqyIvK4pJFpFDKSrYWqCxiT2380sDbFj+gRX9SPvMzMbAf1NRJ5b3rtCjwOHE72R3sDMKxsh2lq3UclvT6FjgZWkc2c2H2F1VTg+rQ8H5iSrtI6GHgqHfZaABwjaVi6kuuYFDMzszrpdSQSEafUsN8zgR+mK7PWAKeQFbR5kk4DHiEb8UB2SfFxQCfw19SWiNgoaQZbR0UXRsTGGuZsZmY9bPfEuqRxZH/02/Lt+/Mo+Ii4h+zS3J6OrtA2yB69Uul9ZgOzy+ZhZmb9U+TqrJ+SXY57A/BCbdMxM7NWUqSIPBMRl9Y8EzMzazlFisg3JJ0P3AQ82x2MiLtqlpWZmbWEIkXkTcCHgaPYejgr0rqZmQ1gRYrI+4B98o+DNzMzg2J3rN8LDK11ImZm1nqKjET2Bh6QtIRtz4mUvsTXzMxeGooUkfNrnoWZmVU070cHNqTfD5x4Z6F2ReYTubXf2ZiZ2UtSkTvWN7P16bhDgJ2Bv0TEHrVMzMzMml+Rkcgr8uuSjgcaM74yM7OmUuTqrG1ExE/xPSJmZkaxw1nvz63uxNY5zc3MbIArcnXWe3PLW4CHyeY9NzOzAa7IOZFazitiZmYtrK/pcc/rY7+IiBk1yMfMzFpIXyfW/1LhBXAaML2/HUsaJOluST9L6+Mk3SFptaRr0qyHSNolrXem7W259zgnxR+UdGx/czIzsx3TaxGJiK91v4BZwG5kU9POBfapQt9nAffn1i8GLomIdmATWbEi/dwUEX8HXJLaIWk8cBLwRmACcJmkQVXIy8zMCurzEl9Je0r6ArCc7NDXARExPSLW96dTSaOB9wD/ntZFdtnwtanJHOD4tDw5rZO2H53aTwbmRsSzEfEQ2Rzsvn/FzKyOei0ikr4CLAE2A2+KiAsiYlOV+v1X4LNsnZ/kVcCTEbElrXcBo9LyKOBRgLT9qdT+b/EK+5iZWR30NRL5FPAa4PPAWklPp9dmSU+X7VDS3wPrI2JZPlyhaWxnW1/79OxzmqSlkpZu2LBhh/I1M7Pe9Xp1VkTs8N3sBR0KTJJ0HLArsAfZyGSopMFptDEaWJvadwFjgC5Jg4FXAhtz8W75fbYREbPIzuvQ0dHhGyXNzKqkVoWiVxFxTkSMjog2shPjv46IDwK3ACekZlOB69Py/LRO2v7riIgUPyldvTUOaAeKPbvYzMyqosgd6/UyHZibTuTfDVyR4lcA35fUSTYCOQkgIlZKmgesIruT/vSIeL7+aZuZDVwNLSIRsQhYlJbXUOHqqoh4Bjixl/1nAjNrl6GZmfWl7oezzMzspcNFxMzMSnMRMTOz0lxEzMysNBcRMzMrzUXEzMxKcxExM7PSXETMzKw0FxEzMyutmR57YmYDyP0zf133Pt9w7lF17/OlziMRMzMrzSMRa1m3vvPwhvR7+G239rrtW5+6oY6ZbHXG197bkH7NPBIxM7PSXETMzKw0FxEzMyvNRcTMzEpzETEzs9LqXkQkjZF0i6T7Ja2UdFaK7ylpoaTV6eewFJekSyV1Slou6YDce01N7VdLmtpbn2ZmVhuNGIlsAT4VEW8ADgZOlzQeOBu4OSLagZvTOsBEoD29pgGXQ1Z0gPOBg8im1T2/u/CYmVl91L2IRMS6iLgrLW8G7gdGAZOBOanZHOD4tDwZuCoyi4GhkkYCxwILI2JjRGwCFgIT6vhRzMwGvIaeE5HUBrwVuAPYOyLWQVZogL1Ss1HAo7ndulKst3ilfqZJWipp6YYNG6r5EczMBrSG3bEu6eXAj4FPRMTTknptWiEWfcRfHIyYBcwC6Ojo+Fubt33mqh1JuWqWfWVKQ/o1M6u2hoxEJO1MVkB+GBE/SeHH02Eq0s/1Kd4FjMntPhpY20fczMzqpBFXZwm4Arg/Ir6e2zQf6L7CaipwfS4+JV2ldTDwVDrctQA4RtKwdEL9mBQzM7M6acThrEOB
DwMrJN2TYp8DLgLmSToNeAQ4MW27ETgO6AT+CpwCEBEbJc0AlqR2F0bExvp8BDMzgwYUkYj4TyqfzwA4ukL7AE7v5b1mA7Orl52Zme0I37FuZmaluYiYmVlpLiJmZlaai4iZmZXmImJmZqW5iJiZWWkuImZmVpqLiJmZleYiYmZmpbmImJlZaS4iZmZWmouImZmV5iJiZmaluYiYmVlpLiJmZlaai4iZmZXmImJmZqW1fBGRNEHSg5I6JZ3d6HzMzAaSli4ikgYB3wYmAuOBkyWNb2xWZmYDR0sXEeBAoDMi1kTEc8BcYHKDczIzGzAUEY3OoTRJJwATIuL/pPUPAwdFxBk92k0DpqXV1wMPVqH74cATVXifamvGvJxTMc6puGbM66We02sjYkTP4OAqvXmjqELsRVUxImYBs6rasbQ0Ijqq+Z7V0Ix5OadinFNxzZjXQM2p1Q9ndQFjcuujgbUNysXMbMBp9SKyBGiXNE7SEOAkYH6DczIzGzBa+nBWRGyRdAawABgEzI6IlXXqvqqHx6qoGfNyTsU4p+KaMa8BmVNLn1g3M7PGavXDWWZm1kAuImZmVpqLiJmZlTYgi4ikV0uaK+n3klZJulHSvpLuq3G/J0paKekFSR09tjUqp69IekDScknXSRraBDnNSPncI+kmSa/psb0heeX6/7SkkDS80TlJukDSY+l3dY+k4xqdU+r7zPRMu5WSvtzonCRdk/sdPSzpnibI6S2SFqeclko6sMf2RuX1Zkm3S1oh6QZJe/S5Q0QMqBfZDYq3Ax/Nxd4CHAbcV+O+30B2x/wioKNJcjoGGJyWLwYuboKc9sgtfxz4TjP8rlJfY8iuBvwDMLzROQEXAJ+uEG9kTkcCvwJ2Set7NTqnHvl9DTiv0TkBNwET0/JxwKIm+e+3BDg8LZ8KzOir/UAciRwJ/E9EfKc7EBH3AI92r0tqk/QbSXel1yEpPlLSbembw32SDpM0SNKVaX2FpE/21nFE3B8RlR650sicboqILWl1MdkNm43O6enc6svY9ikEDcsruQT4bJPlVEkjc/oYcFFEPJv6Xd8EOXW/v4APAFc3QU4BdH/LfyXb3ijdyLxeD9yWlhcC/9BH29a+T6Sk/YBl22mzHnh3RDwjqZ3sH1wH8I/AgoiYqewJwruTfTsYFRH7ASh3OKgFczoVuKYZcpI0E5gCPEX2P1S3huUlaRLwWETcm/0tanxOyRmSpgBLgU9FxKYG57QvcFj6b/gM2UhpSYNz6nYY8HhErE7rjczpE8ACSV8lO7VwSG5bI/O6D5gEXA+cyLZPBXmRgTgSKWJn4LuSVgA/InvMPGTDvFMkXQC8KSI2A2uAfSR9U9IE4OlKb9jsOUk6F9gC/LAZcoqIcyNiTMrnjL7a1iMvSbsD5wLn7WAuNcspuRx4HdkfiXVkh2oandNgYBhwMPAZYJ56VN0G5NTtZLaOQoqqVU4fAz6Z/p1/EriiSfI6FThd0jLgFcBzfWZRy2NrzfgCjgZuqxBvIx1rJDvO3P3tYDCwJdfuNcBHgBXAlBR7OdmQ7wayu+a3l8Mitj0n0tCcgKlkx193b5accu/zWnLHgBuVF/Amsm9+D6fXFuAR4NVN9LvK99ewnIBfAkfk1n8PjGj07ym93+PA6Eb/e0rtnmLrDd8Cnm6GvHr0ty9wZ19tBuJI5NfALpI+0h2Q9HayP1bdXgmsi4gXgA+TPVIFSa8F1kfEd8m+NRyg7AqdnSLix8A/Awe0Uk7pW8l0YFJE/LVJcmrPrU4CHmh0XhGxIiL2ioi2iGgje/jnARHxxwb/rkbmVt9HdiiiYb+n5KfAUem99gWGkD2OvNH/770LeCAiunKxRua0Fjg8LR8FrM5ta+S/qb3Sz52AzwPf6a0tMPBGIrkqPY/sG9JK4OdAO1srfDuwnOxE85eAP6f4VLL/Se8GfgOMA94M3AXck14T++j3fWR/fJ4l+0a0oAly6iQ7Wdfd9jtNkNOP0/7Lyb41
jWqG/349cniYdHVWg39X3yf7trmc7OGjI5sgpyHAD9J73AUc1eic0ntcSe5qp0bnBPwvsvMe9wJ3AG9rkrzOAv4rvS4ijZZ6e/nZWWZmVtpAPJxlZmZVMhAv8a05Sd8GDu0R/kZEfK8R+YBz2hHNmJdzKsY5FVetvHw4y8zMSvPhLDMzK81FxMzMSnMRMasiSUMl/VMd+jlC6VlJZo3kImJWXUOBwkVEmTL/Hx7Bts9aMmsIn1g3qyJJc4HJwIPALcD+ZM+R2hn4fERcL6kN+EXa/g7geLK7qaeT3cW8Gng2Is6QNILsjuGxqYtPAI+R3WD2PLABODMiflOPz2fWk4uIWRWlAvGziNhP0mCy55E9nR47sZjsLuPXkj0Q75CIWKxs0q3fkT2KYjPZIy/uTUXkP4DLIuI/JY0le8rBG9LD9f4cEV+t92c0y/N9Ima1I+CLkt4JvACMAvZO2/4QEYvT8oHArRGxEUDSj8gefAfZCGV87iG4e0h6RT2SNyvCRcSsdj5I9vTat0XE/0h6GNg1bftLrl1fj0nfCXhHRPx3Plj8yepmteUT62bVtZlsDgbInrK6PhWQI9n2Cax5dwKHSxqWDoHlZ5K7idx8KpLeUqEfs4ZxETGrooj4E/BbSfeRTRTVIWkp2ajkgV72eQz4ItmTXH8FrCKbawKyOeY7JC2XtAr4aIrfALxP2RSoh9XsA5lth0+smzUBSS+PiD+nkch1ZJMGXdfovMy2xyMRs+ZwgaR7yOaBeIhsYiezpueRiJmZleaRiJmZleYiYmZmpbmImJlZaS4iZmZWmouImZmV9v8BbGhWhyp/vmMAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Class distribution of the target: check whether the classes are balanced.\n",
    "# The column is passed by keyword: seaborn 0.11 deprecated passing it positionally,\n",
    "# and from 0.12 on the only positional argument of countplot is `data`.\n",
    "sns.countplot(x='target', data=train);\n",
    "plt.xlabel('target');\n",
    "plt.ylabel('Number of occurrences');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "各类样本不太均衡。交叉验证对分类任务缺省的是采用StratifiedKFold，在每折采样时根据各类样本按比例采样"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 各特征的分布\n",
    "猜测不同特征的分布差不太多，所以只观察一个特征feat_1的分布"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZMAAAEHCAYAAABr66s0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3de9xUZb338c8X8KwI4uEhINEd9mgnNVLLNNNStLaoO9u2LUkt0tQ8VDvNktJOZmXZQR+fJLHaklkpqYVkHjqpoCKCqNyaCYGi4AF0pxv97T+ua2CxmLnvhYu5ceL7fr3mNbOu37rWXHPNWus36zBrKSIwMzOro8/aboCZmXU+JxMzM6vNycTMzGpzMjEzs9qcTMzMrLZ+a7sBvW3LLbeM4cOHr+1mmJl1jDvuuOOJiNiqu3HWuWQyfPhwpk2btrabYWbWMST9radxvJvLzMxqczIxM7PanEzMzKw2JxMzM6vNycTMzGpzMjEzs9qcTMzMrDYnEzMzq83JxMzMalvn/gG/7PHFPH7hT5rGtjr+g73cGjOzfw7eMjEzs9qcTMzMrLa2JhNJD0u6R9J0SdNy2RaSpkiak58H5nJJukBSl6QZknYtTGdMHn+OpDGF8jfn6Xflumrn5zEzs+Z6Y8vknRGxc0SMzMOnAzdExAjghjwMcCAwIj/GAhdCSj7AOGB3YDdgXCMB5XHGFuqNav/HMTOzsrWxm2s0MCG/ngAcUii/LJJbgQGSBgMHAFMiYnFEPAlMAUblWP+I+EtEBHBZYVpmZtaL2p1MArhe0h2SxuaybSJiAUB+3jqXDwHmFurOy2Xdlc9rUr4KSWMlTZM0bdHSZ2p+JDMzK2v3qcF7RsR8SVsDUyTd1824zY53xMsoX7Uw4mLgYoCdt92+6ThmZvbytXXLJCLm5+eFwK9Ixzwey7uoyM8L8+jzgGGF6kOB+T2UD21SbmZmvaxtyUTSJpI2a7wG9gdmApOAxhlZY4Cr8+tJwFH5rK49gKfzbrDJwP6SBuYD7/sDk3NsiaQ98llcRxWmZWZmvaidu7m2AX6Vz9btB/xXRPxW0lTgCknHAo8Ah+fxrwMOArqA54CjASJisaRzgKl5vLMjYnF+fTxwKbAR8Jv8MDOzXta2ZBIRDwFvalK+CNivSXkAJ7SY1nhgfJPyacDrazfWzMxq8T/gzcysNicTMzOrzcnEzMxqczIxM7PanEzMzKw2JxMzM6vNycTMzGpzMjEzs9qcTMzMrDYnEzMzq83JxMzManMyMTOz2pxMzMysNicTMzOrzcnEzMxqczIxM7PanEzMzKw2JxMzM6vNycTMzGpzMjEzs9qcTMzMrDYnEzMzq83JxMzManMyMTOz2pxMzMysNicTMzOrzcnEzMxqczIxM7PanEzMzKy2HpOJpE0k9cmvd5B0sKT12t80MzPrFFW2TG4BNpQ0BLgBOBq4tJ2NMjOzzlIlmSgingMOA74bEYcCO1V9A0l9Jd0l6Zo8vJ2k2yTNkfQzSevn8g3ycFeODy9M44xcfr+kAwrlo3JZl6TTq7bJzMzWrErJRNJbgSOBa3NZv9V4j5OB2YXhc4HzI2IE8CRwbC4/FngyIl4DnJ/HQ9JOwBHA64BRwA9yguoLfB84kJTcPpDHNTOzXlYlmZwCnAH8KiJmSdoeuLHKxCUNBd4D/DAPC9gXuDKPMgE4JL8enYfJ8f3y+KOBiRHxfET8FegCdsuProh4KCJeACbmcc3MrJf1uIURETcDN0vaJA8/BHyi4vS/DfwnsFkeHgQ8FRHL8vA8YEh+PQSYm99jmaSn8/hDgFsL0yzWmVsq371ZIySNBcYCDN1iUMWmm5lZVVXO5nqrpHvJu6okvUnSDyrUey+wMCLuKBY3GTV6iK1u+aqFERdHxMiIGDlo0/7dtNrMzF6OKsc+vg0cAEwCiIi7Je1dod6ewMGSDgI2BPrnaQ2Q1C9v
nQwF5ufx5wHDgHmS+gGbA4sL5Q3FOq3KzcysF1X602JEzC0VvVihzhkRMTQihpMOoP8+Io4kHW95Xx5tDHB1fj0pD5Pjv4+IyOVH5LO9tgNGALcDU4ER+eyw9fN7TKryeczMbM2qsmUyV9LbgMgr7U+w8tlZq+szwERJXwLuAi7J5ZcAP5bURdoiOQIgH/S/ArgXWAacEBEvAkg6EZgM9AXGR8SsGu0yM7OXSenHfzcjSFsC3wHeRTpOcT1wckQsan/z1rydt90+ppx+dtPYVsd/sJdbY2b2yifpjogY2d04Vc7meoL0HxMzM7OmqpzNNUHSgMLwQEnj29ssMzPrJFUOwL8xIp5qDETEk8Au7WuSmZl1mirJpI+kgY0BSVuwepdTMTOzf3JVksI3gT9LalwC5XDgy+1rkpmZdZoqB+Avk3QH8E7S2VyHRcS9bW+ZmZl1jKq7q+4jXeG3H4CkV0fEI21rlZmZdZQek4mkk4BxwGOkf76LdA2sN7a3aWZm1imqbJmcDLy2U/+kaGZm7VflbK65wNPtboiZmXWuKlsmDwE3SboWeL5RGBHfalurzMyso1RJJo/kx/r5YWZmtpIqpwZ/EUDSJhHxbPubZGZmnaZtd1o0M7N1R5UD8I07LS6CdKdFoMqdFs3MbB3RtjstmpnZumNt3GnRzMz+yVTZMjkOOAEYAswDds7DZmZmQA9bJpL6Ah+KCN9p0czMWup2yyQiXgRG91JbzMysQ1U5ZvInSd8DfgYs/59JRNzZtlaZmVlHqZJM3pafzy6UBbDvmm+OmZl1op6OmfQBLoyIK3qpPWZm1oF6OmbyEnBiL7XFzMw6VJVTg6dI+pSkYZK2aDza3jIzM+sYVY6ZHJOfi/8tCWD7Nd8cMzPrRFWuGrxdbzTEzMw6V5V7wB/VrDwiLlvzzTEzs05UZTfXWwqvNwT2A+4EnEzMzAyotpvrpOKwpM2BH7etRWZm1nEqXYK+5DlgxJpuiJmZda4qd1r8taRJ+XENcD9wdYV6G0q6XdLdkmZJatz+dztJt0maI+ln+bL2SNogD3fl+PDCtM7I5fdLOqBQPiqXdUk6ffU/vpmZrQlVjpl8o/B6GfC3iJhXod7zwL4RsVTSesAfJf0GOA04PyImSroIOBa4MD8/GRGvkXQEcC7w75J2Ao4AXge8CvidpB3ye3wfeDfp0vhTJU2KiHsrtM3MzNagKru5HgFui4ibI+JPwKLiVkMrkSzNg+vlR+OaXlfm8gnAIfn16DxMju8nSbl8YkQ8HxF/BbqA3fKjKyIeiogXgIn4CsdmZmtFlWTyc+ClwvCLuaxHkvpKmg4sBKYADwJPRcSyPMo80k23yM9zAXL8aWBQsbxUp1V5s3aMlTRN0rRFS5+p0nQzM1sNVZJJv/zLH4D8ev0qE4+IFyNiZ2AoaUtix2aj5We1iK1uebN2XBwRIyNi5KBN+/fccDMzWy1Vksnjkg5uDEgaDTyxOm8SEU8BNwF7AAMkNY7VDAXm59fzgGH5PfoBmwOLi+WlOq3Kzcysl1W9B/xnJT0i6RHgM8DHeqokaStJA/LrjYB3AbOBG4H35dHGsOLMsEl5mBz/fURELj8in+21Hem05NuBqcCIfHbY+qSD9JMqfB4zM1vDqvxp8UFgD0mbAoqIJRWnPRiYkO8j3we4IiKukXQvMFHSl4C7gEvy+JcAP5bURdoiOSK//yxJVwD3ks4mOyHfThhJJwKTgb7A+IiYVbFtZma2BlW5NtdXgK/nXVVIGgh8MiI+1129iJgB7NKk/CHS8ZNy+T+Aw1tM68vAl5uUXwdc19NnMDOz9qqym+vARiIBiIgngYPa1yQzM+s0VZJJX0kbNAby8Y8NuhnfzMzWMVX+Af8T4AZJPyKdensMK/5caGZmVukA/NclzSCdjQVwTkRMbm+zzMysk1TZMoF01lXjcih3ta85ZmbWiapcNfj9pP91vA94P3CbpPd1X8vMzNYlVbZMzgTeEhELIf0Z
EfgdKy7WaGZm67gqZ3P1aSSSbFHFemZmto6osmXyW0mTgcvz8L/jPwqamVlBlbO5Pi3pMODtpCv1XhwRv2p7y8zMrGNUOpsrIn4J/LLNbTEzsw7lYx9mZlabk4mZmdXWMplIuiE/n9t7zTEzs07U3TGTwZLeARwsaSKl2+RGxJ1tbZmZmXWM7pLJWcDppNvhfqsUC2DfdjXKzMw6S8tkEhFXAldK+nxEnNOLbTIzsw5T5X8m50g6GNg7F90UEde0t1lmZtZJqlzo8avAyaR7sN8LnJzLzMzMgGp/WnwPsHNEvAQgaQLpMvRntLNhZmbWOar+z2RA4fXm7WiImZl1ripbJl8F7pJ0I+n04L3xVomZmRVUOQB/uaSbgLeQkslnIuLRdjfMzMw6R9ULPS4AJrW5LWZm1qF8bS4zM6vNycTMzGrrNplI6iNpZm81xszMOlO3yST/t+RuSa/upfaYmVkHqnIAfjAwS9LtwLONwog4uG2tMjOzjlIlmXyx7a0wM7OOVuV/JjdL2hYYERG/k7Qx0Lf9TTMzs05R5UKPHwWuBP5fLhoCXFWh3jBJN0qaLWmWpJNz+RaSpkiak58H5nJJukBSl6QZknYtTGtMHn+OpDGF8jdLuifXuUCSVm2JmZm1W5VTg08A9gSeAYiIOcDWFeotAz4ZETsCewAnSNqJdMOtGyJiBHBDHgY4EBiRH2OBCyElH2AcsDuwGzCukYDyOGML9UZVaJeZma1hVZLJ8xHxQmNAUj/SnRa7FRELGrf2jYglwGzSVs1oYEIebQJwSH49GrgskluBAZIGAwcAUyJicUQ8CUwBRuVY/4j4S0QEcFlhWmZm1ouqJJObJX0W2EjSu4GfA79enTeRNBzYBbgN2CZfnqVxmZbGVs4QYG6h2rxc1l35vCblzd5/rKRpkqYtWvrM6jTdzMwqqJJMTgceB+4BPgZcB3yu6htI2hT4BXBKRHS3Jm92vCNeRvmqhREXR8TIiBg5aNP+PTXZzMxWU5WzuV7KN8S6jbSyvj/vVuqRpPVIieSnEfHLXPyYpMERsSDvqlqYy+cBwwrVhwLzc/k+pfKbcvnQJuObmVkvq3I213uAB4ELgO8BXZIOrFBPwCXA7Ij4ViE0CWickTUGuLpQflQ+q2sP4Om8G2wysL+kgfnA+/7A5BxbImmP/F5HFaZlZma9qMqfFr8JvDMiugAk/QtwLfCbHurtCXwIuEfS9Fz2WeBrwBWSjgUeAQ7PseuAg4Au4DngaICIWCzpHGBqHu/siFicXx8PXApslNvTU5vMzKwNqiSThY1Ekj3Eil1TLUXEH2l+XANgvybjB+k05GbTGg+Mb1I+DXh9T20xM7P2aplMJB2WX86SdB1wBemYyeGs2EowMzPrdsvkXwuvHwPekV8/DgxcdXQzM1tXtUwmEXF0bzbEzMw6V4/HTCRtB5wEDC+O70vQm5lZQ5UD8FeRTvH9NfBSe5tjZmadqEoy+UdEXND2lpiZWceqkky+I2kccD3wfKOwcRFHMzOzKsnkDaQ/H+7Lit1ckYfNzMwqJZNDge2Ll6E3MzMrqnLV4LuBAe1uiJmZda4qWybbAPdJmsrKx0x8arCZmQHVksm4trfCzMw6WpX7mdzcGw0xM7POVeUf8EtYcQfD9YH1gGcjwrcsNDMzoNqWyWbFYUmHALu1rUVmZtZxqpzNtZKIuAr/x8TMzAqq7OY6rDDYBxjJit1eZmZmlc7mKt7XZBnwMDC6La0xM7OOVOWYie9rYmZm3erutr1ndVMvIuKcNrTHzMw6UHdbJs82KdsEOBYYBDiZmJkZ0P1te7/ZeC1pM+Bk4GhgIvDNVvXMzGzd0+0xE0lbAKcBRwITgF0j4sneaJiZmXWO7o6ZnAccBlwMvCEilvZaq8zMrKN096fFTwKvAj4HzJf0TH4skfRM7zTPzMw6QXfHTFb73/FmZrZucsIwM7PanEzMzKw2JxMzM6vNycTMzGpzMjEzs9ralkwkjZe0
UNLMQtkWkqZImpOfB+ZySbpAUpekGZJ2LdQZk8efI2lMofzNku7JdS6QpHZ9FjMz6147t0wuBUaVyk4HboiIEcANeRjgQGBEfowFLoTl/8AfB+xOurvjuEYCyuOMLdQrv5eZmfWStiWTiLgFWFwqHk26LAv5+ZBC+WWR3AoMkDQYOACYEhGL82VcpgCjcqx/RPwlIgK4rDAtMzPrZb19zGSbiFgAkJ+3zuVDgLmF8eblsu7K5zUpNzOzteCVcgC+2fGOeBnlzScujZU0TdK0RUt9JRgzszWtt5PJY3kXFfl5YS6fBwwrjDcUmN9D+dAm5U1FxMURMTIiRg7atH/tD2FmZivr7WQyCWickTUGuLpQflQ+q2sP4Om8G2wysL+kgfnA+/7A5BxbImmPfBbXUYVpmZlZL+vxHvAvl6TLgX2ALSXNI52V9TXgCknHAo8Ah+fRrwMOArqA50g34SIiFks6B5iaxzs7IhoH9Y8nnTG2EfCb/DAzs7WgbckkIj7QIrRfk3EDOKHFdMYD45uUTwNeX6eNZma2ZrxSDsCbmVkHczIxM7PanEzMzKw2JxMzM6vNycTMzGpzMjEzs9qcTMzMrDYnEzMzq83JxMzManMyMTOz2pxMzMystrZdm6tTPX7RRS1jWx13XC+2xMysc3jLxMzManMyMTOz2pxMzMysNicTMzOrzcnEzMxqczIxM7PanEzMzKw2JxMzM6vNycTMzGpzMjEzs9qcTMzMrDYnEzMzq83JxMzManMyMTOz2pxMzMysNicTMzOrzTfHWk2PXfiVlrFtjv9sL7bEzOyVw1smZmZWm5OJmZnV5t1ca9jfv39Cy9iQE77fiy0xM+s9Hb9lImmUpPsldUk6fW23x8xsXdTRWyaS+gLfB94NzAOmSpoUEfeu3Za1Nud7o1vGRpx4NXde9K8t47se9+t2NMnMrLaOTibAbkBXRDwEIGkiMBp4xSaTuv7w/9/bMrbXR69h8iUHtYwfcOx17WiSmRmKiLXdhpdN0vuAURHxkTz8IWD3iDixNN5YYGwefC1wfyG8JfBEN29TJ97OaTvuuOPrbry333vbiNiqm/EhIjr2ARwO/LAw/CHgu6s5jWntirdz2o477vi6G1/bbWv26PQD8POAYYXhocD8tdQWM7N1Vqcnk6nACEnbSVofOAKYtJbbZGa2zunoA/ARsUzSicBkoC8wPiJmreZkLm5jvJ3Tdtxxx9fd+Npu2yo6+gC8mZm9MnT6bi4zM3sFcDIxM7P6Vvf0r3+WBzCK9H+TLuD0Umw8sBCY2aLuMOBGYDYwCzi5FN8QuB24O8e/2GI6fYG7gGuaxB4G7gGm0+Q0PWAAcCVwX27HWwux1+Z6jcczwCml+qfmts0ELgc2LMVPzrFZwCnN+gTYApgCzAH+Djxeih+e678EXN2k/nm5/TPy5y3XPyfHpufpP9HsOwE+BUST+l/I9aYDi4CnyvWBk/J88CTwbKn+zwp9uAT4n1L8TcBf8md4Ik9n+fxQ6J+/AoubxIv9M608PxX6Z3b+bOX6jf6Zlac/h+bz4zm5fx4o1W/0z6z8+R4p18/982Dum0Wl+o3+mQX8N/CPUnxn4NZc9kzuh2K80X8zc//fQ2F5AbYDbiMto4sKn7URPzHHAriT0vIG/LTQZ4+X44XP+APgxSb1/5A/393AC8DTpfi++X1nAhOA9Sksz4X2z8l9tWEp3mjfTNLytUEpfkl+7xmkZb1/Kd5o33TSWaxXleL75fZNz9/N/ZTWJ6yY/2fn5+XrE1aeP0f2uE5d2yv1tfEgrcQfBLbPM8DdwE6F+N7ArrROJoOBXfPrzUgLabG+gE3z6/XyDLVHk+mcBvwXrZPJlt18hgnAR/Lr9YEB3XzWR0l/OmqUDSEt2Bvl4SuADxfir88z+MakkzR+B/xHuU+Ar5MTMXARcGkpviMpsd0EfLRJ/f2BfoUFq1y/f+H1d/ICVU4Gw0gnYDxKWrjLyeRTrb5T4J35s22Q
4/t1851PBC4s1Z8KvCPPD18grbSXzw+N/snxC4BzS/FG//wZOLI8PzX6J9e/tEn9/oX58ev5O1hpfmTFD58FpD+iFet/gZSIm87Phf7ZNvfd1uXpF97/x8BZpfrXAwfm+El5PijGG/0n4Pjcf8uXF9J8eUSO/zCPU4zvAgwnLSvblpc34KBcV8DPy/Xz+CNz25e2Wl5z/auAowrxtwFzgR3yOGeT5pHly3Oj/YXl48pSvNi+y5vEi/P/t4BraL2++EWOFes/AOyYXy8CLi/VKc7/E4BTi+sTVl5+e0wm6+puruWXYYmIF0gzwfKLZkXELaRfek1FxIKIuDO/XkLK5EMK8YiIpXlwvfxY6UwHSUOB95AWktUiqT9p5XdJfr8XIuKpFqPvBzwYEX8rlfcDNpLUj5Q0iv/P2RG4NSKei4hlwM2k//CU+2Q0aSYE+CKwVzEYEbMjonG1gbvK9SPi+jx9SAvSgFL8mcLgo6RfV2XnA/+ZY082iTem1ew7PR74WkQ8n+MPNqsrScCepF+XRa8FbomIBaTv4t9K88NoYEKOfxU4pBgv9M8L5KsylOLXR8SyXP9qYGgp/kyusyB/9mgyP55P2rJ8vjz9Qt+0mp8b/fO3iLgzIhY2q0/6bvYhrayK8SCtEBeQttzml+KN/gvSivLfWHl52Re4MscvAQ4pxiPiroh4OLfh2fxcjF+Xl8UgJeyhxXi+tt95pPmHcv1C2aakpHdVIf4i8HxEPJDHmQ68i7w853lmX9J8DXAdaeW9fHkvte9+4M2l+DOFaW0J/F+arC8kbUZazl9VigdpawbSIY1HS1WPB77Gih9T387v+0JEPFVafnu0riaTIaRfFQ3zWHnhqEzScNIvpNtK5X0lTSft2pkSEbeVqn6bNBO/1GLSAVwv6Y58OZii7Umb7T+SdJekH0rapMV0jiD96lkx4Yi/A98g7dZYADwdEdcXRpkJ7C1pkKSNSb+gin8ObdgmrygaK7QtW7ShimNIv4BWIunLkuYCR5J+nRVjBwN/j4i7u5nuiZJmSBrPigWrYQdgL0m3SboZeGOLaewFPEb6BVw0Ezg4vz4cGFaaH8r9s3Wr+aXwmVrFjwF+U46X+uesYrxZ/zSZ/vL+kTSwFF+pfyS9pUX79gIei4g5pfgpwHm5fd8AzijFi/33flJyWUjaNfgg8FThx8Z80gqv1fLUp9XyJmk90tUx3l+KnwhManxH3SyvhwI3ALcU2nc7sJ6kkXmcc0kJvbE8Dyq1/2Ol+HK5fScDXyrHJf2IlARGAWOa1c/tWwp8shT/CHCdpHnAJsA7S+uTHUjf3Z9yeydVWJ+0tK4mEzUpW+1zpCVtStq8PKX0K5qIeDEidib9GtpN0usL9d4LLIyIO7qZ/J4RsStpN8EJkvYuxPqRdjtcGBG7kH6VrXL5/fxHzoNJm/jF8oGkX83bkX7NbCLpg4W2zyYtHFOA35J2Ay6jTSSdmad/VTkWEWdGxDDSbrCjCnU2Bs4k7Vpp5ULgX0j77hcAnyvF+wEDSbtEPk26AnUzH6CUkLNjSN/NHaTdNy/QYn4oaBlvNT+V+meleKl/Tm3E8/jl/tmkVL/cP98pxcv98/MW7f8AcHmT9h9P2nUyLLft0lK82H+bko5pDSXtOdix1D0vkY6PrLI8NeKtljfSMZFbImJEIb436QfAdxsjdVP/A6StruVx4HWkH2rnS3qAdDxlaaHO8nVMXt4XkY4rNfNr4JGIuKQciIijSbuIF5JW/s18grT7tbw+ORU4KCKGkpbnaay8Pml8v0eT9k68mbReabo+6VF5v9e68CAdXJpcGD4DOKM0znBa7D+PFftWJwOnVXi/ceR993n4q6StoYdJvzqeA37STf0vlOr/H+DhwvBewLVN6o0Grm9SfjhwSWH4KOAH3bz/V4CPl/uEtGk+OL8eTPo12ewA+U2kfdOr9Cnp19Zf8szcss9J++3vb8SBN5AWsIfzYxnpYPJ9LeoPL9bPZb8F
9ikM/w2YXarXj7RVMrSH9u1EWqGcVigr9s8w0kK6yvyS+2f3ZvNToX/6dze/kZLCkka8Rf/8NzCuRf3XFOuX+4c0vz8HfL5F/wwvty/3hwr1l3XT/h2A2wvLy6dJu8Yax9SWL7Osujw9TOH4YjGeX18F9CnFx5GWvUb/vETa9V2uP4iUCDZsNv3C8vx4/n4by/NPG+3P8YW5/1da3vO0HqCb9UGh/nNN6g/K0y3Xv5a0e7sxjVcD9xbXJ43vl7w+IS2/W1Fan1DxmMlaXamvrUf+gh8i/TJvHIB/XWmc4bRecQi4DPh2i/hW5APiwEaksy7e22LcfSgdUCP9gtys8PrPpKsjF8f5A/DawsxxXpNpTwSOblK+O+ksjY3zZ5kAnFQaZ+vCTHgf6RfMSn1C2t/cOAB/OukgY+VkQtp0vxfYqlmfAyMKr08i7Xdu9Z08TNp9Uqw/uPD6VNIvwGL8OODs/HoH0q6UcrIbBdzcon2NPuqTF8TrS3XPy/0i0lk1TS+el/vn2vL8VOyfZvNbo3/y9G8H5nQzvy4FLiqVDy7EpwL3l+LHkQ4sC/gVKdmoWf+0aN9s0vwt0sH4x1rMY1uTtvyOobC8kLaEjsiffzzpB80qyxNpd+325eWNtJvnz6RE3nJ5zNNf2iye+2Bis/qF9m9A2g22L4XludH+/Pqi3P5ivNG+jcrrg9xnryl8P9/Ij+X1C+2b0KR+P1Iy24G0Dvk4aatw+fqElef/aaStU1Fan+Bk0sMHT8cBHiCtBM4sxS7PHfs/pIx/bCn+dtJuscZpq9NJm5ON+BtJB5xnkPYLn9VNO1aaOXLZ9qQE1zhV8cwm9XbOM8AM0i+vgaX4xqRfVJu3eN8vkpLETNLZLBuU4n8grcjuJh3cW6VPSL+KbiCd+vhofhTjh+bXz5MOkD9fineRjl1NZ8WpucX4L3L7ZpC2Oh7r5jtZ2uT9f0w63XRGLnusFF8f+El+j8W5v1aaPmnXzHEtPv/JeR56pNn8UOifuTk+qxRv9M8LOb6kFG/0z5wcX1SKN/qncXrsvTSfH9/e4v0b/dO0fqF/HsrxB8vTz/1zXovP/3bgjsL055Tijf57OH83Ky0vpOXg9ty/jVOHi/FP5P5blr+XxaX4stkPRlYAAAIdSURBVNzm+1jxq36V5ZG0vL5Yfv/CivR4mizP+XM3Tqk9pbw8F9rfRUosG5TijfY1+uQsViSDPqRjGY3P/FPS1uny+oX2jSoMF6d/aK4/m7R8NE49PzPHi/N/43MsX5+w8vL7GIW9Oc0evpyKmZnVtq4egDczszXIycTMzGpzMjEzs9qcTMzMrDYnEzMzq83JxMzManMyMVvDJH1C0mxJP13NesMl/UcP4wySdKOkpZK+V6+lZmuOk4nZmvdx0p/6jlzNesNJl/rvzj+Az5Muh2H2iuFkYrYGSbqI9M/nSZLOzFfjnZqvxjo6jzNc0h8k3Zkfb8vVv0a6Su90Sac2m35EPBsRf6T55fjN1hr/A95sDZP0MOlaZKeRLq73E0kDSJfW2IV0aZGXIuIfkkaQrkg7UtI+pAsIvrfCe3yYdL2kE9v0McxWS7+13QCzf2L7AwdLauyS2pB04cz5wPck7Uy6JlSrS4ubdQwnE7P2EenuiyvdrU7SF0gXznsTaVezd1lZx/MxE7P2mQyclG+7iqRdcvnmwIKIeIl0B8C+uXwJ6SZbZh3HycSsfc4h3RRqhqSZeRjSnf/GSLqVtIurcf/yGcAySXe3OgAPy4/JfAv4sKR5knZq1wcwq8oH4M3MrDZvmZiZWW0+AG/2CiTpAODcUvFfI+LQtdEes554N5eZmdXm3VxmZlabk4mZmdXmZGJmZrU5mZiZWW3/C+EbwmJ2P8zgAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Value distribution of a single feature, feat_1.\n",
    "# The column is passed by keyword: seaborn 0.11 deprecated passing it positionally,\n",
    "# and from 0.12 on the only positional argument of countplot is `data`.\n",
    "sns.countplot(x='feat_1', data=train);\n",
    "plt.xlabel('feat_1');\n",
    "plt.ylabel('Number of occurrences');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- 特征值大部分是0（约90%的数据为0），特征非常稀疏\n",
    "- 取值呈长尾分布，可以考虑log(x+1)变换，减弱长尾中大特征值的影响\n",
    "- 看起来这些特征和计数有关系 --> 特征工程也可考虑TF-IDF"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征之间的相关系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Absolute Pearson correlation between every pair of feature columns\n",
    "# (a coefficient above 0.5 is commonly regarded as a strong correlation).\n",
    "# `id` is only a row identifier and `target` holds string labels (Class_x), which\n",
    "# DataFrame.corr() refuses to convert in pandas >= 2.0, so both are left out.\n",
    "feat_corr = train.drop([\"id\", \"target\"], axis=1).corr().abs()\n",
    "\n",
    "# Names of the correlated columns, in the same order as the rows/columns of feat_corr\n",
    "cols = feat_corr.columns\n",
    "\n",
    "plt.subplots(figsize=(13, 9))\n",
    "# NOTE(review): annot=True writes a number into every cell; with this many features\n",
    "# that is slow and hard to read - consider dropping it.\n",
    "sns.heatmap(feat_corr, annot=True)\n",
    "\n",
    "# Overlay that draws only the cells whose value is 1 (everything below 1 is masked)\n",
    "sns.heatmap(feat_corr, mask=feat_corr < 1, cbar=False)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Report only the feature pairs whose absolute correlation reaches this threshold\n",
    "threshold = 0.5\n",
    "# One [correlation, row index, column index] entry per reported pair\n",
    "corr_list = []\n",
    "# Take the labels from feat_corr itself so that indices and names cannot drift apart\n",
    "corr_cols = feat_corr.columns\n",
    "size = feat_corr.shape[0]\n",
    "\n",
    "# Walk the upper triangle only (j > i): every unordered pair is visited once\n",
    "# and the diagonal is never touched\n",
    "for i in range(0, size):\n",
    "    for j in range(i+1, size):\n",
    "        corr_ij = feat_corr.iloc[i, j]\n",
    "        # feat_corr already holds absolute values, so one comparison is enough.\n",
    "        # Perfectly correlated pairs (value 1) are reported too; NaN never passes.\n",
    "        if corr_ij >= threshold:\n",
    "            corr_list.append([corr_ij, i, j])\n",
    "\n",
    "# Strongest correlations first\n",
    "s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))\n",
    "\n",
    "# Print each pair of column names with its correlation\n",
    "for v, i, j in s_corr_list:\n",
    "    print (\"%s and %s = %.2f\" % (corr_cols[i], corr_cols[j], v))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "特征之间相关性很高，一定要加正则，也可以考虑对特征进行降维（PCA/t-SNE）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征工程"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "特征变换，这个是体力活\n",
    "1. 取对数log1p（对线性模型很重要，单调变换树模型影响不大）\n",
    "2. tf-idf\n",
    "3. 原始特征组合（加减乘除。如果是计数特征，乘法表示“and”，更有意义（FM）；或者可采用GBDT做特征编码，实现更高阶特征组合；原始特征维数太高，也可以先用基础模型得到特征的重要性，对重要的特征再组合）\n",
    "4. t-SNE及PCA降维后的特征 （降维部分讲解）\n",
    "5. 统计特征，如sum of the row, number of non-zero, max of the row，x-mean，个人感觉对这个数据集意义不大"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分开特征和标签"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample ids: not used as a feature, kept so they can be written out with the results\n",
    "train_id = train['id']\n",
    "\n",
    "# Labels, stored in the form Class_x\n",
    "y_train = train['target']\n",
    "\n",
    "# Feature matrix: every column except the id and the label\n",
    "X_train = train.drop(columns=[\"id\", \"target\"])\n",
    "\n",
    "# Original feature names, reused to name the transformed features\n",
    "columns_org = X_train.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. feat编码：log(x+1)\n",
    "原始特征feat_x看起来像计数特征，取log运算更接近人对数字的敏感度，更适合线性模型。\n",
    "同时也可以降低长尾分布中大数值的影响，减弱分布的长尾性。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names for the transformed features: original name plus a \"_log\" suffix\n",
    "feat_names = columns_org + \"_log\"\n",
    "\n",
    "# Apply log(1 + x) to every feature value and wrap the result in a DataFrame again\n",
    "X_train_log = pd.DataFrame(data=np.log1p(X_train).values, columns=feat_names)\n",
    "\n",
    "X_train_log.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. feat编码：TF-IDF\n",
    "原始特征feat_x看起来像计数特征，类似文本分析中词频特征的处理，TF-IDF可以突出对特定类别有贡献的低频词。\n",
    "这里原始特征已经是计数特征了，直接调用TfidfTransformer，将计数特征变成TF-IDF。\n",
    "如果输入是原始文本，需要将计数功能（TF）和IDF功能集中在一起，用TfidfVectorizer。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Re-weight the raw count features as TF-IDF features\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "\n",
    "tfidf = TfidfTransformer()\n",
    "\n",
    "# Column names for the transformed features: original name plus a \"_tfidf\" suffix\n",
    "feat_names = columns_org + \"_tfidf\"\n",
    "\n",
    "# Fit on the training counts; .toarray() turns the result into a dense array,\n",
    "# which is wrapped in a DataFrame so that it can be inspected\n",
    "X_train_tfidf = pd.DataFrame(\n",
    "    data=tfidf.fit_transform(X_train).toarray(),\n",
    "    columns=feat_names,\n",
    ")\n",
    "\n",
    "X_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. 其他特征工程\n",
    "对应上文特征变换列表中的第5项（统计特征）：一行的最大值、和、非0元素数目。\n",
    "可以将这些特征加到原始特征中（下面的代码目前被注释掉，没有启用）。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
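    "# Optional row-level statistics (disabled). If enabled, compute all of them from the\n",
    "# original feature columns first: as written, feat_sum would also add in feat_max,\n",
    "# and the zero count would scan the two newly added columns as well.\n",
    "#X_train['feat_max'] = X_train.max(axis=1)\n",
    "#X_train['feat_sum'] = X_train.sum(axis=1)\n",
    "#X_train['feat_zero_count'] = X_train.apply(lambda x : x.value_counts().get(0,0),axis=1)\n",
    "#X_train.head()"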
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据预处理\n",
    "由于数据极度稀疏，数据缩放应采用MinMaxScaler：计数特征的最小值为0，缩放后0仍然是0，变换后的数据继续保持稀疏。\n",
    "如果将特征看作词频这种特征，也可以不用缩放，每个样本用模长归一。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale the original features\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Keep the feature names before X_train is overwritten below;\n",
    "# they are needed when the result is saved as csv\n",
    "feat_names_org = X_train.columns\n",
    "\n",
    "# fit learns each feature's minimum and maximum from the training data,\n",
    "# transform then rescales the training data with them\n",
    "ms_org = MinMaxScaler()\n",
    "ms_org.fit(X_train)\n",
    "X_train = ms_org.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale the log-transformed features\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Keep the feature names before X_train_log is overwritten below;\n",
    "# they are needed when the result is saved as csv\n",
    "feat_names_log = X_train_log.columns\n",
    "\n",
    "# Learn each feature's minimum and maximum from the training data ...\n",
    "ms_log = MinMaxScaler().fit(X_train_log)\n",
    "# ... and rescale the training data with them\n",
    "X_train_log = ms_log.transform(X_train_log)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scale the TF-IDF features\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "# Scaler for the TF-IDF features\n",
    "ms_tfidf = MinMaxScaler()\n",
    "\n",
    "# Keep the feature names before X_train_tfidf is overwritten below;\n",
    "# they are needed when the result is saved as csv\n",
    "feat_names_tfidf = X_train_tfidf.columns\n",
    "\n",
    "# fit learns each feature's minimum and maximum from the training data,\n",
    "# transform then rescales the training data with them\n",
    "ms_tfidf.fit(X_train_tfidf)\n",
    "X_train_tfidf = ms_tfidf.transform(X_train_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the scaled original features as: id | features | target\n",
    "feat_names = columns_org\n",
    "y = pd.Series(y_train, name='target')\n",
    "train_org = pd.concat(\n",
    "    [\n",
    "        train_id,\n",
    "        pd.DataFrame(data=X_train, columns=feat_names_org),\n",
    "        y,\n",
    "    ],\n",
    "    axis=1,\n",
    ")\n",
    "train_org.to_csv(dpath + 'Otto_FE_train_org.csv', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the scaled log features as: id | features | target\n",
    "y = pd.Series(y_train, name='target')\n",
    "train_log = pd.concat(\n",
    "    [\n",
    "        train_id,\n",
    "        pd.DataFrame(data=X_train_log, columns=feat_names_log),\n",
    "        y,\n",
    "    ],\n",
    "    axis=1,\n",
    ")\n",
    "train_log.to_csv(dpath + 'Otto_FE_train_log.csv', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the scaled TF-IDF features as: id | features | target\n",
    "y = pd.Series(y_train, name='target')\n",
    "train_tfidf = pd.concat(\n",
    "    [\n",
    "        train_id,\n",
    "        pd.DataFrame(data=X_train_tfidf, columns=feat_names_tfidf),\n",
    "        y,\n",
    "    ],\n",
    "    axis=1,\n",
    ")\n",
    "train_tfidf.to_csv(dpath + 'Otto_FE_train_tfidf.csv', index=False, header=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存特征编码过程中用到的模型，用于后续对测试数据的特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# Fitted transformers, keyed by the file name each one is stored under.\n",
    "# The file names are kept exactly as they were (including the \"Sclaer\" spelling);\n",
    "# presumably the code that encodes the test data loads them by these names - verify before renaming.\n",
    "fitted_models = {\n",
    "    \"tfidf.pkl\": tfidf,\n",
    "    \"MinMaxSclaer_org.pkl\": ms_org,\n",
    "    \"MinMaxSclaer_log.pkl\": ms_log,\n",
    "    \"MinMaxSclaer_tfidf.pkl\": ms_tfidf,\n",
    "}\n",
    "\n",
    "for file_name, model in fitted_models.items():\n",
    "    # The with-block closes every file, even if dump() raises;\n",
    "    # the previous open(...) handles were never closed explicitly\n",
    "    with open(dpath + file_name, 'wb') as model_file:\n",
    "        pickle.dump(model, model_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
